#' Read and transform raw MORG data
#'
#' Data is read from `.dta` files in `data.dir` defined outside the function
#' 
#' @param years Vector of years for which to read the data
#' @param varlist Vector of variable names to include in the returned data
#' @param drop.low Boolean, whether to drop observations with too low earnings
#'     (earning less than the 1980 minimum wage (converted to 2016 dollars))
#' @param drop.high Boolean, whether to drop observations with too high earnings
#'     (those who earn more than the current earnings top coded times 1.5 divided by 35 hours/week)
#' @param months Vector of month numbers indicating which months to include
#' @return List of data sets indexed by year


read.transform.data <- function(years,varlist,drop.low,drop.high,months){
    morg = list()
    for (year in years){
        message("=======================================================")
        message("Reading data for year ",year)
        message("=======================================================")
        year2digit = substr(as.character(year),3,4)
        data <- read_dta(paste(data.dir,"morg",year2digit,".dta",sep = ""))
        morg[[year]] <- transform.data(data,varlist,drop.low,drop.high,months)
        # drop original data
        rm(data)
        ind = grep("occ",names(morg[[year]]))
    }

    return(morg)
}

transform.data <- function(data,varlist,drop.low,drop.high,months){
    d.year <- data$year[[1]]
    message("=======================================================")
    message("Transforming data for year ",d.year)
    message("=======================================================")

    # filter out observations
    data.sample <- data %>% filter(
               (intmonth %in% months)&
               (age>=16&age<=64))
    message(nrow(data.sample)," observations left after restricting to given months and age")

    # windsorize topcoded earnings
    message("windsorizing top earnings")
    data.sample$earnwke.wnd <- data.sample$earnwke
    topcode <- max(data.sample$earnwke,na.rm=TRUE)
    data.sample$earnwke.wnd[data.sample$earnwke.wnd==topcode & !is.na(data.sample$earnwke.wnd)] <- 1.5*topcode

    message("imputing from Pareto distribution with alpha = 1.7692")
    data.sample$earnwke.pareto <- data.sample$earnwke
    topcode <- max(data.sample$earnwke,na.rm=TRUE)
    nobs <- length(data.sample$earnwke.pareto[data.sample$earnwke.pareto==topcode & !is.na(data.sample$earnwke.wnd)])

    set.seed(1)
    data.sample$earnwke.pareto[data.sample$earnwke.pareto==topcode& !is.na(data.sample$earnwke.wnd)] <- rpareto(nobs,topcode,1.7692)
    
    # generate hourly wage variable and rounded earning weights
    data.sample <- data.sample %>% mutate(hrwage = earnwke.wnd/uhourse,
                                   earnwt.rnd = round(earnwt), hrwage_pareto = earnwke.pareto/uhourse, hrwage_original = earnwke/uhourse)

    # Flag those who earn more than the current earnings top coded times 1.5 divided by 35 hours/week
    data.sample <- mutate(data.sample,
                          too_high_wage = ifelse(hrwage>(topcode*1.5/35),1,0))

    # convert to 2016 dollars
    data.sample <- data.sample %>% mutate(hrwage16 = hrwage*cpi$cpi16[cpi$year==d.year],
                                          earnwke16 = earnwke*cpi$cpi16[cpi$year==d.year],
                                          hrwage16_pareto = hrwage_pareto*cpi$cpi16[cpi$year==d.year],
                                          hrwage16_original = hrwage_original*cpi$cpi16[cpi$year==d.year])

    # Flag those who are earning less than the 1980 minimum wage (converted to 2016 dollars)
    # see https://www.dol.gov/whd/minwage/chart.htm for minwages
    min.wage <- 3.10 * cpi$cpi16[cpi$year==1980]
    data.sample <- mutate(data.sample,
                          too_low_wage = ifelse(hrwage16<min.wage,1,0))

    # merge occ1990d classification
    if ((d.year >= 1980) & (d.year <= 1982)){
        data.sample <- data.sample %>% mutate(occ = occ70) %>%
            left_join(occ1970.occ1990dd,by="occ")
    }
    else if ((d.year >= 1983) & (d.year <= 1999)){
        data.sample <- data.sample %>% mutate(occ = occ80) %>%
            left_join(occ1980.occ1990dd,by="occ")
    }
    else if ((d.year >= 2000) & (d.year <= 2010)){
        data.sample <- data.sample %>% mutate(occ = occ00) %>%
            left_join(occ00.occ1990,by="occ")

        # drop occ variable, rename occ1990 to occ
        data.sample <- data.sample %>% select(-occ) %>%
            rename(occ=occ1990)

        # merge occ 1990dd classification based on occ1990
        data.sample <-  data.sample %>% left_join(occ1990.occ1990dd,by="occ")
    }
    else if  ((d.year >= 2011) & (d.year <= 2012)){
        # merge occ1990 classification
        data.sample <- data.sample %>% mutate(occ = occ2011) %>%
            left_join(occ2012.occ1990,by="occ")

        # drop occ variable, rename occ1990 to occ
        data.sample <- data.sample %>% select(-occ) %>%
            rename(occ=occ1990)

        # merge occ 1990dd classification based on occ1990
        data.sample <-  data.sample %>% left_join(occ1990.occ1990dd,by="occ")
    }
    else if  ((d.year >= 2013) & (d.year <= 2016)){
        # merge occ1990 classification
        data.sample <- data.sample %>% mutate(occ = occ2012) %>%
            left_join(occ2012.occ1990,by="occ")

        # drop occ variable, rename occ1990 to occ
        data.sample <- data.sample %>% select(-occ) %>%
            rename(occ=occ1990)

        # merge occ 1990dd classification based on occ1990
        data.sample <-  data.sample %>% left_join(occ1990.occ1990dd,by="occ")
    }
    else{
        print("check year for classification")
    }

    # drop self-employed
    if (d.year <= 1993){
        data.sample <- data.sample %>% filter((class != 5) & (class != 6))
    }
    else{
        data.sample <- data.sample %>% filter((class94 != 6) & (class94 != 7))
    }
    message(nrow(data.sample)," observations left after dropping self-employed")

    # merge 2-digit classification
    data.sample <- data.sample %>% left_join(occ1990dd.2digit,by="occ1990dd")


    # classify into three groups, drop non-classified
    data.sample <-  data.sample %>% mutate(cat3 = ifelse(Recode >= 11 & Recode <= 13, 'abs',
                                                  ifelse(Recode >= 21 & Recode <= 31, 'rt',
                                                  ifelse(Recode >= 32 & Recode <= 34, 'svc',NA))))


    nobs <- nrow(data.sample)

    if  (drop.high == 1){
    # drop observations with too high and too low earnings
    data.sample <- filter(data.sample,too_high_wage==0)
    nobs.two <- nrow(data.sample)
    message(nobs-nobs.two," observations dropped due to too high earnings")
    }

    if (drop.low == 1){
    data.sample <- filter(data.sample,too_low_wage==0)
    nobs.three <- nrow(data.sample)
    message(nobs.two-nobs.three," observations dropped due to too low earnings")
    }

    # take care of education variable
    data.sample <- data.sample %>% mutate(ihigrdc = ifelse(year >= 1998,ihigrdc,NA))
    data.sample <- data.sample %>% mutate(gradeat = ifelse(year < 1992,gradeat,NA))
    data.sample <- data.sample %>% mutate(grade92 = ifelse(year >= 1992,grade92,NA))

    # select variables
    data.sample <- data.sample %>% select(one_of(varlist))


    return(data.sample)
}


